{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare Datasets\n",
    "\n",
    "## Introduction\n",
    "In this notebook, we will prepare the datasets for the benchmarking of different OpenAI Embedding models for Binary Quantization. We will use the following datasets. We will use a 100K sample of the DBPedia dataset for the benchmarking. \n",
    "\n",
    "## Approach\n",
    "We will use the following approach to prepare the datasets:\n",
    "1. Load the datasets\n",
    "2. Sanitize and Prepare the text\n",
    "3. Compute & Save the embeddings back to the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from typing import List\n",
    "\n",
    "import loguru\n",
    "from datasets import load_dataset\n",
    "from datasets.exceptions import DatasetNotFoundError\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "from tqdm import tqdm\n",
    "\n",
    "load_dotenv()  # take environment variables from .env.\n",
    "\n",
    "logger = loguru.logger\n",
    "logger.add(\"logs.log\", format=\"{time} {level} {message}\", level=\"INFO\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = OpenAI()\n",
    "bs = 1000  # Batch size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "331c4d415ce3496fb0cba45db047cc8f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def sanitize(text: str):\n",
    "    text = text.replace(\"\\n\", \" \")\n",
    "    text = text.replace(\"\\t\", \" \")\n",
    "    text = text.strip()\n",
    "    if len(text) <= 0:\n",
    "        return \" \"\n",
    "    return text\n",
    "\n",
    "\n",
    "def prepare_dataset(dataset_name: str = \"KShivendu/dbpedia-entities-openai-1M\"):\n",
    "    dataset = load_dataset(\"KShivendu/dbpedia-entities-openai-1M\", split=\"train\")\n",
    "    dataset = dataset.shuffle(seed=42)\n",
    "    dataset = dataset.select(range(100000))\n",
    "    dataset = dataset.map(\n",
    "        lambda x: {\"combined_text\": sanitize(f\"{x['title']}\\n{x['text']}\")}\n",
    "    )\n",
    "    combined_text = dataset[\"combined_text\"]\n",
    "\n",
    "    return dataset, combined_text\n",
    "\n",
    "\n",
    "dataset, combined_text = prepare_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_embedding(texts: List[str], model: str, dimensions: int):\n",
    "    return client.embeddings.create(input=texts, model=model, dimensions=dimensions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_embeddings(\n",
    "    bs: int, combined_text: List[str], MODEL_NAME: str, DIMENSIONS: int\n",
    "):\n",
    "    \"\"\"\n",
    "    This function creates embeddings for a given text using a specified OpenAI model.\n",
    "\n",
    "    Parameters:\n",
    "    - bs (int): The batch size for processing the text.\n",
    "    - MODEL_NAME (str): The name of the model to use for generating embeddings.\n",
    "    - DIMENSIONS (int): The number of dimensions for the embeddings.\n",
    "\n",
    "    Returns:\n",
    "    - response_objects (list): A list of response objects containing the embeddings.\n",
    "\n",
    "    \"\"\"\n",
    "    response_objects = []\n",
    "    for i in tqdm(range(0, len(combined_text), bs)):\n",
    "        response_objects.append(\n",
    "            get_embedding(combined_text[i : i + bs], MODEL_NAME, DIMENSIONS)\n",
    "        )\n",
    "    return response_objects"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Putting it all together"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_combinations = [\n",
    "    # {\n",
    "    #     \"model_name\": \"text-embedding-3-large\",\n",
    "    #     \"dimensions\": 3072,\n",
    "    # },\n",
    "    # {\n",
    "    #     \"model_name\": \"text-embedding-3-large\",\n",
    "    #     \"dimensions\": 1024,\n",
    "    # },\n",
    "    # {\n",
    "    #     \"model_name\": \"text-embedding-3-large\",\n",
    "    #     \"dimensions\": 1536,\n",
    "    # },\n",
    "    # {\n",
    "    #     \"model_name\": \"text-embedding-3-small\",\n",
    "    #     \"dimensions\": 512,\n",
    "    # },\n",
    "    # {\n",
    "    #     \"model_name\": \"text-embedding-3-small\",\n",
    "    #     \"dimensions\": 1024,\n",
    "    # },\n",
    "    {\n",
    "        \"model_name\": \"text-embedding-3-small\",\n",
    "        \"dimensions\": 1536,\n",
    "    },\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-02-06 01:27:38.134\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mWorking on Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-100K\u001b[0m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ea1200d774d1463297c06d7b4814bede",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "acc7ef6a4954425497c5c39222e2361e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2cde007100be4678a692f76f6fa61cad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b168a405d6547d392549855b16b35c6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7027d007733247c69081680f3e8317c3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f3d034f41a0a43ac86cf698e36a177f5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cfaa77c37c244b34a815466fb6d2bb12",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/421 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-02-06 01:28:05.935\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mWorking on Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1024-100K\u001b[0m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a60a1a1f2ddd46f5ad68d1bf802e3596",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "52d28654a7bf4e3cad9d4ed3a934bd96",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1642d5a9481c4ec0abd3e67bc50a19e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-02-06 01:29:07.920\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mWorking on Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-100K\u001b[0m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c2a60761de8b4b9a90eb1182c820a9d1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b7dc6e8f869a443597ba48467f5431b2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "84a8cdc50ea847f0aee2cd32c802d22e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "65c8c8b816834f0d984ef0fb0c3091f8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b77208ce28894183a90ceecdb1821ad3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/420 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-02-06 01:29:43.130\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mWorking on Qdrant/dbpedia-entities-openai3-text-embedding-3-small-512-100K\u001b[0m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2e75d3cc940a4ef3991164acf181cce9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/490 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b473803f73b14657b12e2661351274eb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/323M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4ec037ac29e3424ea864443dac2d44de",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/323M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fccb0f17fd8a4c858625d1d365df1fa5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/323M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "024e357bf7324f9396cdb4c7674d7fa2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/323M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "34faf8dadfd14f5aa7c1651290cb1c92",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c8cb83d433f24b1ea0ee8ace765387f5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "39de48c9fc424fc686d0e73a8e8ee8c4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5aebfe3f2cd84160ad2d4cce2c84a9f7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eb30ed6c46624aa5ad203ea5613979ab",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "72aa70d8dbc94104968c7010ecf4e805",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a4336d3d57f14a0aa7dedf554772bd51",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/490 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-02-06 01:36:15.236\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mWorking on Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1024-100K\u001b[0m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f477c79eff22434ea30e40e712b4b624",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c5ac66e5811e470f8ee46276bbaaef95",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "321f9d93276242ebbb37f49aef05261c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e8af129814ff43e69111325e56f10222",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md:   0%|          | 0.00/460 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-02-06 01:37:05.522\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mWorking on Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K\u001b[0m\n"
     ]
    },
    {
     "ename": "EmptyDatasetError",
     "evalue": "The directory at hf://datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K@7e82935b3d4b60d1f14f456025cbe8b934740a73 doesn't contain any data files",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mEmptyDatasetError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 5\u001b[0m\n\u001b[1;32m      3\u001b[0m DATASET_NAME \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mQdrant/dbpedia-entities-openai3-\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mMODEL_NAME\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m-\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDIMENSIONS\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m-100K\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      4\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWorking on \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATASET_NAME\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDATASET_NAME\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrain\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mopenai\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m dataset\u001b[38;5;241m.\u001b[39mcolumn_names:\n\u001b[1;32m      7\u001b[0m     dataset \u001b[38;5;241m=\u001b[39m dataset\u001b[38;5;241m.\u001b[39mremove_columns(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mopenai\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m/opt/homebrew/Caskroom/miniconda/base/envs/fst/lib/python3.11/site-packages/datasets/load.py:2523\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[1;32m   2518\u001b[0m verification_mode \u001b[38;5;241m=\u001b[39m VerificationMode(\n\u001b[1;32m   2519\u001b[0m     (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mBASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mALL_CHECKS\n\u001b[1;32m   2520\u001b[0m )\n\u001b[1;32m   2522\u001b[0m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[0;32m-> 2523\u001b[0m builder_instance \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2524\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2525\u001b[0m \u001b[43m    \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2526\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2527\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2528\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2529\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2530\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2531\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2532\u001b[0m \u001b[43m    \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2533\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2534\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2535\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2536\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_require_default_config_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m   2537\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2538\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2540\u001b[0m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[1;32m   2541\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n",
      "File \u001b[0;32m/opt/homebrew/Caskroom/miniconda/base/envs/fst/lib/python3.11/site-packages/datasets/load.py:2195\u001b[0m, in \u001b[0;36mload_dataset_builder\u001b[0;34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, use_auth_token, storage_options, trust_remote_code, _require_default_config_name, **config_kwargs)\u001b[0m\n\u001b[1;32m   2193\u001b[0m     download_config \u001b[38;5;241m=\u001b[39m download_config\u001b[38;5;241m.\u001b[39mcopy() \u001b[38;5;28;01mif\u001b[39;00m download_config \u001b[38;5;28;01melse\u001b[39;00m DownloadConfig()\n\u001b[1;32m   2194\u001b[0m     download_config\u001b[38;5;241m.\u001b[39mstorage_options\u001b[38;5;241m.\u001b[39mupdate(storage_options)\n\u001b[0;32m-> 2195\u001b[0m dataset_module \u001b[38;5;241m=\u001b[39m \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2196\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2197\u001b[0m \u001b[43m    \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2198\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2199\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2200\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2201\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2202\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2203\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2204\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_require_default_config_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_require_default_config_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2205\u001b[0m \u001b[43m    \u001b[49m\u001b[43m_require_custom_configs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mbool\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2206\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2207\u001b[0m \u001b[38;5;66;03m# Get dataset builder class from the processing script\u001b[39;00m\n\u001b[1;32m   2208\u001b[0m builder_kwargs \u001b[38;5;241m=\u001b[39m dataset_module\u001b[38;5;241m.\u001b[39mbuilder_kwargs\n",
      "File \u001b[0;32m/opt/homebrew/Caskroom/miniconda/base/envs/fst/lib/python3.11/site-packages/datasets/load.py:1840\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)\u001b[0m\n\u001b[1;32m   1838\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt reach the Hugging Face Hub for dataset \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1839\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)):\n\u001b[0;32m-> 1840\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e1 \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   1841\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m):\n\u001b[1;32m   1842\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[1;32m   1843\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find a dataset script at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(combined_path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or any data file in the same directory. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1844\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m on the Hugging Face Hub either: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(e1)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1845\u001b[0m     ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[0;32m/opt/homebrew/Caskroom/miniconda/base/envs/fst/lib/python3.11/site-packages/datasets/load.py:1828\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)\u001b[0m\n\u001b[1;32m   1812\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m HubDatasetModuleFactoryWithScript(\n\u001b[1;32m   1813\u001b[0m             path,\n\u001b[1;32m   1814\u001b[0m             revision\u001b[38;5;241m=\u001b[39mrevision,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1818\u001b[0m             trust_remote_code\u001b[38;5;241m=\u001b[39mtrust_remote_code,\n\u001b[1;32m   1819\u001b[0m         )\u001b[38;5;241m.\u001b[39mget_module()\n\u001b[1;32m   1820\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1821\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mHubDatasetModuleFactoryWithoutScript\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1822\u001b[0m \u001b[43m            \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1823\u001b[0m \u001b[43m            \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1824\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1825\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1826\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1827\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m-> 1828\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1829\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e1:\n\u001b[1;32m   1830\u001b[0m     \u001b[38;5;66;03m# All the attempts failed, before raising the error we should check if the module is already cached\u001b[39;00m\n\u001b[1;32m   1831\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n",
      "File \u001b[0;32m/opt/homebrew/Caskroom/miniconda/base/envs/fst/lib/python3.11/site-packages/datasets/load.py:1208\u001b[0m, in \u001b[0;36mHubDatasetModuleFactoryWithoutScript.get_module\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1206\u001b[0m     patterns \u001b[38;5;241m=\u001b[39m sanitize_patterns(\u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28miter\u001b[39m(metadata_configs\u001b[38;5;241m.\u001b[39mvalues()))[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_files\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m   1207\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1208\u001b[0m     patterns \u001b[38;5;241m=\u001b[39m \u001b[43mget_data_patterns\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbase_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1209\u001b[0m data_files \u001b[38;5;241m=\u001b[39m DataFilesDict\u001b[38;5;241m.\u001b[39mfrom_patterns(\n\u001b[1;32m   1210\u001b[0m     patterns,\n\u001b[1;32m   1211\u001b[0m     base_path\u001b[38;5;241m=\u001b[39mbase_path,\n\u001b[1;32m   1212\u001b[0m     allowed_extensions\u001b[38;5;241m=\u001b[39mALL_ALLOWED_EXTENSIONS,\n\u001b[1;32m   1213\u001b[0m     download_config\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload_config,\n\u001b[1;32m   1214\u001b[0m )\n\u001b[1;32m   1215\u001b[0m module_name, default_builder_kwargs \u001b[38;5;241m=\u001b[39m infer_module_for_data_files(\n\u001b[1;32m   1216\u001b[0m     data_files\u001b[38;5;241m=\u001b[39mdata_files,\n\u001b[1;32m   1217\u001b[0m     path\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname,\n\u001b[1;32m   1218\u001b[0m     download_config\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload_config,\n\u001b[1;32m   1219\u001b[0m )\n",
      "File \u001b[0;32m/opt/homebrew/Caskroom/miniconda/base/envs/fst/lib/python3.11/site-packages/datasets/data_files.py:475\u001b[0m, in \u001b[0;36mget_data_patterns\u001b[0;34m(base_path, download_config)\u001b[0m\n\u001b[1;32m    473\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m _get_data_files_patterns(resolver)\n\u001b[1;32m    474\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m EmptyDatasetError(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe directory at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbase_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m doesn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt contain any data files\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "\u001b[0;31mEmptyDatasetError\u001b[0m: The directory at hf://datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K@7e82935b3d4b60d1f14f456025cbe8b934740a73 doesn't contain any data files"
     ]
    }
   ],
   "source": [
    "for combination in dataset_combinations:\n",
    "    MODEL_NAME, DIMENSIONS = combination[\"model_name\"], combination[\"dimensions\"]\n",
    "    DATASET_NAME = f\"Qdrant/dbpedia-entities-openai3-{MODEL_NAME}-{DIMENSIONS}-100K\"\n",
    "    logger.info(f\"Working on {DATASET_NAME}\")\n",
    "    dataset = load_dataset(DATASET_NAME, split=\"train\")\n",
    "    if \"openai\" in dataset.column_names:\n",
    "        dataset = dataset.remove_columns(\"openai\")\n",
    "    if \"embedding\" in dataset.column_names:\n",
    "        dataset = dataset.rename_column(\n",
    "            \"embedding\", f\"{MODEL_NAME}-{DIMENSIONS}-embedding\"\n",
    "        )\n",
    "    if \"combined_text\" in dataset.column_names:\n",
    "        dataset = dataset.remove_columns(\"combined_text\")\n",
    "    dataset.push_to_hub(DATASET_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "524df43ed3974a488a84abb01b9c5ffb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-02-06 02:02:59.784\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m6\u001b[0m - \u001b[1mChecking on Qdrant/dbpedia-entities-openai3-text-embedding-3-small-1536-100K\u001b[0m\n",
      "\u001b[32m2024-02-06 02:03:00.122\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m10\u001b[0m - \u001b[1mCreating embeddings for text-embedding-3-small with 1536 dimensions\u001b[0m\n",
      "100%|██████████| 100/100 [08:56<00:00,  5.36s/it]\n",
      "\u001b[32m2024-02-06 02:11:56.219\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1mEmbeddings created for text-embedding-3-small with 1536 dimensions\u001b[0m\n",
      "\u001b[32m2024-02-06 02:13:26.579\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m100000 Embeddings added to the dataset\u001b[0m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a472d5d0b5c440f0aad7a588d42f970c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9e80cfbe9d3747f7976a6beae09123aa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "692be8ecabe64b51a171be49f026a3bc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "076ceb08461145c8ac9e94d5edaf09f9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ba988e307efa4a04916ab95dbbd97097",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# if dataset has been created, rename the column to model_name and dimensions format\n",
    "dataset, combined_text = prepare_dataset()\n",
    "for combination in dataset_combinations:\n",
    "    MODEL_NAME, DIMENSIONS = combination[\"model_name\"], combination[\"dimensions\"]\n",
    "    DATASET_NAME = f\"Qdrant/dbpedia-entities-openai3-{MODEL_NAME}-{DIMENSIONS}-100K\"\n",
    "    logger.info(f\"Checking on {DATASET_NAME}\")\n",
    "    try:\n",
    "        dataset = load_dataset(DATASET_NAME, split=\"train\")\n",
    "    except DatasetNotFoundError:\n",
    "        logger.info(\n",
    "            f\"Creating embeddings for {MODEL_NAME} with {DIMENSIONS} dimensions\"\n",
    "        )\n",
    "        response_objects = create_embeddings(\n",
    "            bs,\n",
    "            combined_text=combined_text,\n",
    "            MODEL_NAME=MODEL_NAME,\n",
    "            DIMENSIONS=DIMENSIONS,\n",
    "        )\n",
    "        embedding_responses = [r.data for r in response_objects]\n",
    "        embedding_objects = [\n",
    "            item for sublist in embedding_responses for item in sublist\n",
    "        ]\n",
    "        embeddings = [e.embedding for e in embedding_objects]\n",
    "        logger.info(f\"Embeddings created for {MODEL_NAME} with {DIMENSIONS} dimensions\")\n",
    "        dataset = dataset.add_column(f\"{MODEL_NAME}-{DIMENSIONS}-embedding\", embeddings)\n",
    "        logger.info(f\"{len(embeddings)} Embeddings added to the dataset\")\n",
    "        dataset.push_to_hub(DATASET_NAME)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fastembed",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
